Overview
There is mainly 3 parts to this story:
A simple easy to communicate model of the key relationship
A medium complexity smoothing analysis
A full power time series analysis with causal inference
The two data set used in this analysis are the Madison case and waste water concentration data.
## Date Site Cases N1 N1Error
## 435 2021-06-19 Madison 3 NA NA
## 436 2021-06-20 Madison 8 49443 8812
## 437 2021-06-21 Madison 1 90447 20429
## 438 2021-06-22 Madison 3 44587 12427
## 439 2021-06-23 Madison 4 14469 9155
## 440 2021-06-24 Madison NA 28023 7724
A simple display of the data shows the core components of this story. First that both data sets are extremely noisy. And that there is a hint of a relationship between the two signals.
MinMaxNorm <- function(Vec){#normalizes the data to range from 0 and 1
normVec <- (Vec-min(Vec,na.rm=TRUE))/max(Vec,na.rm=TRUE)
return(normVec)
}
NoNa <- function(DF,...){#Removes NA from the reverent columns
ColumnNames <- c(...)
NoNaDF <- DF%>%
filter(
across(
.cols = ColumnNames,
.fns = ~ !is.na(.x))
)
return(NoNaDF)
}
FillNA <- function(DF,...){#Fills NA with previous values
ColumnNames <- c(...)
NoNaDF <- DF%>%
fill(ColumnNames)
return(NoNaDF)
}
FirstImpression <- FullDF%>%
NoNa("N1","Cases")%>%#Removing outliers
ggplot(aes(x=Date))+#Data depends on time
geom_line(aes(y=MinMaxNorm(N1), color="N1",info=N1))+#compares N1 to Cases
geom_line(aes(y=MinMaxNorm(Cases), color="Cases",info=Cases))+
labs(y="variable min max normalized")
ggplotly(FirstImpression,tooltip=c("info","Date"))
Cross correlation and Granger Causality are key component to this analysis. Cross correlation looks at the correlation at a range of time shifts and Granger analysis preform a test for predictive power. We find that there is to much noise to find significance.
Cases <- FullDF$Cases
N1 <- FullDF$N1
ccf(Cases,N1,na.action=na.pass)
grangertest(Cases, N1, order = 1)
## Granger causality test
##
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(Cases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
## Res.Df Df F Pr(>F)
## 1 168
## 2 169 -1 1.4783 0.2257
grangertest(N1,Cases, order = 1)
## Granger causality test
##
## Model 1: Cases ~ Lags(Cases, 1:1) + Lags(N1, 1:1)
## Model 2: Cases ~ Lags(Cases, 1:1)
## Res.Df Df F Pr(>F)
## 1 168
## 2 169 -1 4e-04 0.9837
IntermediateOutlierGraphic <- FALSE
DaySmoothed=21#Very wide smoothing to find where the data strong deviate from trend
FullDF2 <- FullDF%>%
mutate(N1 = ifelse(Date < mdy("11/20/2020"),NA,N1))
FullDF3 <- FullDF2%>%#Remove older data that clearly has no relationship to Cases
mutate(SmoothN1=rollapply(data = N1, width = DaySmoothed, FUN = median,
na.r = TRUE,fill=NA),#Finding very smooth version of the data with no outliers
SmoothN1=ifelse(is.na(SmoothN1),N1,SmoothN1),#Fixing issue where rollapply fills NA on right border
LargeError=N1>1.5*SmoothN1,#Calculating error Limits
N1=ifelse(LargeError,SmoothN1,N1))%>%#replacing data points that variance is to large
select(-SmoothN1,-LargeError)#Removing unneeded calculated columns
if(IntermediateOutlierGraphic){
OutlierGraphic <- FullDF%>%
mutate(SmoothN1=rollapply(data = N1, width = DaySmoothed, FUN = median,
na.r = TRUE,fill=NA),#creating smooth data
SmoothN1=ifelse(is.na(SmoothN1),N1,SmoothN1))%>%#Fixing issue where rollapply fills NA on right border)%>%
mutate(Outliers=Date < mdy("11/20/2020")|N1>1.5*SmoothN1)%>%
NoNa("N1","Cases")%>%#Removing outliers
ggplot(aes(x=Date))+#Data depends on time
geom_point(aes(y=MinMaxNorm(N1), color="N1",shape=Outliers,info=N1))+#compares N1 to Cases
geom_point(aes(y=MinMaxNorm(Cases), color="Cases",info=Cases))+
labs(y="variable min max normalized")
ggplotly(OutlierGraphic,tooltip=c("info","Date"))
}
FullDF3%>%
NoNa("N1","Cases")%>%#Removing outliers
ggplot(aes(x=Date))+#Data depends on time
geom_line(aes(y=MinMaxNorm(N1), color="N1"))+#compares N1 to Cases
geom_line(aes(y=MinMaxNorm(Cases), color="Cases"))+
labs(y="variable min max normalized")
library(tseries)
TestDF2 <- FullDF3%>%
FillNA("N1","Cases")%>%
NoNa("N1","Cases")
Cases <- TestDF2$Cases
N1 <- TestDF2$N1
# kpss.test(Cases)
# adf.test(Cases)
#
# kpss.test(N1)
# adf.test(N1)
ccf(Cases,N1,na.action=na.pass)
grangertest(Cases, N1, order = 1)
grangertest(N1,Cases, order = 1)
A key component to this relationship is that the relationship between N1 and Case involves a gamma distribution modeling both the time between catching Covid-19 and getting a test and the concentration of the shedded particles. We found a gamma distribution with mean 11.73 days and a sd of 7.68 match’s other research and gives good results.
SLDWidth <- 21
scale <- 5.028338
shape <- 2.332779 #These parameters are equivalent to the mean and sd above
weights <- dgamma(1:SLDWidth, scale = scale, shape = shape)
plot(weights,
main=paste("Gamma Distribution with mean = 11.73 days,and SD = 7.68"),
ylab = "Weight",
xlab = "Lag")
SLDSmoothedDF <- FullDF3%>%
mutate(
SLDCases = c(rep(NA,SLDWidth-1),#elimination of starting values not relevant as we have a 50+ day buffer of case data
rollapply(Cases,width=SLDWidth,FUN=weighted.mean,
w=weights,
na.rm = FALSE)))#no missing data to remove
SLDSmoothedDF%>%
NoNa("N1","SLDCases")%>%#same plot as earlier but with the SLD smoothing
ggplot(aes(x=Date))+
geom_line(aes(y=MinMaxNorm(SLDCases), color="SLDCases"))+
geom_line(aes(y=MinMaxNorm(N1), color="N1"))+
facet_wrap(~Site)+
labs(y="variable min max normalized")
The SLD Cause a signfigent improvement in the relationship now showing a statisticly signifigent test result
TestDF3 <- SLDSmoothedDF%>%
FillNA("N1","SLDCases")%>%
NoNa("N1","SLDCases")
SLDCases <- TestDF3$SLDCases
N1 <- TestDF3$N1
ccf(SLDCases,N1,na.action=na.pass)
grangertest(SLDCases, N1, order = 1)
## Granger causality test
##
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 8.3701 0.004115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
grangertest(N1,SLDCases, order = 1)
## Granger causality test
##
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 16.616 5.974e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
To isolate this relationship we used a primitive binning relationship. This clarifies the relationship we see hints of in the previous graphic and masks the noise in the data.
medianMean <- function(Vec){
return(mean(replace(Vec, c(which.min(Vec), which.max(Vec)), NA), na.rm = TRUE))
}
StartDate <- 1
DaySmoothing <- 14
Lag <- 4
BinDF <- SLDSmoothedDF%>%
select(Date, SLDCases, N1)%>%
mutate(MovedCases = data.table::shift(SLDCases, Lag),
Week=(as.numeric(Date)+StartDate)%/%DaySmoothing)%>%
group_by(Week)%>%
#filter(Week>2670)%>%
summarise(BinnedCases=mean(MovedCases, na.rm=TRUE), BinnedN1=exp(mean(log(N1), na.rm=TRUE)))
BinDF%>%
ggplot()+
geom_line(aes(x=Week, y=MinMaxNorm(BinnedN1), color="N1"))+
geom_line(aes(x=Week, y=MinMaxNorm(BinnedCases), color="Cases"))+
labs(y="Binned variable min max normalized")
BinDF%>%
ggplot()+
geom_point(aes(x=BinnedCases, y=BinnedN1))
cor(BinDF$BinnedN1, BinDF$BinnedCases, use="pairwise.complete.obs")
## [1] 0.5280613
summary(lm(BinnedCases~BinnedN1, data=BinDF))
##
## Call:
## lm(formula = BinnedCases ~ BinnedN1, data = BinDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96.41 -48.94 -32.55 54.48 165.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.130e+01 2.631e+01 1.570 0.1322
## BinnedN1 4.650e-04 1.672e-04 2.781 0.0115 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.64 on 20 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.2788, Adjusted R-squared: 0.2428
## F-statistic: 7.733 on 1 and 20 DF, p-value: 0.01153
To generate this relationship without reducing the amount of data we rely on a loess smoothing of the data. The loess smoothing is a way of generating smooth curves from noisy data. The displayed plot shows the visual power of this smoothing. We see a relationship in the big patterns but also multiple sub patterns match. We see in general that smoothed N1 both lags and leads the case data.
SLDSmoothedDF$loessN1 <- loessFit(y=(SLDSmoothedDF$N1),
x=SLDSmoothedDF$Date,
span=.2,
iterations=2)$fitted
SLDSmoothedDF%>%
NoNa("loessN1","SLDCases")%>%
ggplot()+
geom_line(aes(x=Date, y=MinMaxNorm(loessN1), color="loessN1"))+
geom_line(aes(x=Date, y=MinMaxNorm(SLDCases), color="SLDCases"))+
facet_wrap(~Site)+
labs(y="variable min max normalized")
The loess smoothing roughly doubled the correlation at all time lags. The two granger tests p value also got signifigently smaller
TestDF4 <- SLDSmoothedDF%>%
FillNA("loessN1","SLDCases")%>%
NoNa("loessN1","SLDCases")
SLDCases <- TestDF4$SLDCases
N1 <- TestDF4$loessN1
ccf(SLDCases,N1,na.action=na.pass)
grangertest(SLDCases, N1, order = 1)
## Granger causality test
##
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 89.183 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
grangertest(N1,SLDCases, order = 1)
## Granger causality test
##
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 69.758 3.177e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1